This is analysis of mutational signatures in COAD and STAD datasets from ICGC for MMR paper (Meier, Volkova et al. 2017).

Get the data and signatures and contributions

Preparation

source('plotting functions/plot_sigs.R')
source("nmSolve.R")
source("plotting functions/plot_decomposition.R")
source('plotting functions/scatterpie.R')
library(tsne)
library(NMF)
library(devtools)
#devtools::install_github("mg14/mg14") # for plotting purposes
library(mg14)
library(ggplot2)
library(reshape2)
library(VariantAnnotation)
library(deconstructSigs)

Get COSMIC signatures

sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
                "signatures_probabilities.txt", sep = "")
cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
cancer_signatures = cancer_signatures[order(cancer_signatures[,1]),]
types <- as.character(cancer_signatures$Trinucleotide) # trinucleotide classes
types.full <- as.character(cancer_signatures$Somatic.Mutation.Type) # substitution types
row.names(cancer_signatures) <- types.full
cancer_signatures = as.matrix(cancer_signatures[,4:33])

Adjust them from exomes:

cancer_signatures_adj <- cancer_signatures
for (i in 1:ncol(cancer_signatures)) {
  cancer_signatures_adj[,i] <- cancer_signatures_adj[,i] / tri.counts.genome[types,1] * tri.counts.exome[types,1]
  cancer_signatures_adj[,i] <- cancer_signatures_adj[,i] / sum(cancer_signatures_adj[,i])
}

Prepare human exome counts (regions taken from Agilent SureSelect V5 Human All Exon, human genome sequence - from hg19 build)

ref_genome="BSgenome.Hsapiens.UCSC.hg19"
library(BSgenome.Hsapiens.UCSC.hg19)
exome <- read.table("S04380110_Covered.bed",header = FALSE, sep="\t",stringsAsFactors=FALSE, skip=2,quote="")
a <- getSeq(get(ref_genome))
a <- a[1:23] # get rid of Y
gr <- as(seqinfo(a), "GRanges") # turn into GRanges object
genome(gr) <- "hg19"
# Get the sequence for well covered exome
exactexomelist <- list()
for (j in 1:23) {
  tmp <- exome[exome$V1==seqlevels(a)[j],]
  exactexomelist[[j]] <- lapply(1:nrow(tmp), function(i)
    a[[j]][tmp$V2[i]:tmp$V3[i]])
}

Prepare C. elegans counts in order to account for trinucleotide content difference:

WBcel235 <- readDNAStringSet("C. elegans data/Caenorhabditis_elegans.WBcel235.dna_sm.toplevel.fa.gz")
worm.trinucleotides <- colSums(trinucleotideFrequency(WBcel235)[-5,])
human.trinucleotides <- as.vector(t(tri.counts.genome))
names(human.trinucleotides) <- row.names(tri.counts.genome)
trinucleotide.freq.factor <- sapply(unique(types), function(x) {
  freq.worm <- worm.trinucleotides[x] + worm.trinucleotides[as.character(reverseComplement(DNAString(x)))]
  return(freq.worm /  human.trinucleotides[x]) # tri.counts.genome is already classified w.r.t. pyrimidine reference
})
human.trinucleotides <- as.vector(t(tri.counts.exome)) # / sum(tri.counts.genome))) # counts from "deconstructSigs" package
names(human.trinucleotides) <- row.names(tri.counts.exome)
trinucleotide.freq.factor.ex <- sapply(unique(types), function(x) {
  freq.worm <- worm.trinucleotides[x] + worm.trinucleotides[as.character(reverseComplement(DNAString(x)))]
  return(freq.worm / human.trinucleotides[x]) # tri.counts.genome is already classified w.r.t. pyrimidine reference
})
names(trinucleotide.freq.factor.ex) = names(trinucleotide.freq.factor) <- unique(types)

Visualize trinucleotide differences between C.elegans and human exome:

C. elegans MMR mutational patterns

Upload the mutation counts from C. elegans samples and calculate C. elegans signatures using additive Poisson model:

\(Y_{i,j} = Pois(\lambda_{i,j})\),

\(E[Y_{i,j}] = N \cdot (\beta_{j,b} + X_{g_{1}} \beta_{j,g_{1}} + X_{g_{2}} \beta_{j,g_{2}} + X_{g_{1}:g_{2}} \beta_{j,g_{1}:g_{2}})\),

where \(β_{j,\cdot} \ge 0\) - effects, \(N\) - generation number, \(g_{1}\), \(g_{2}\) - genetic backgrounds, \(b\) - background contribution, \(X_{...} \in {0,1}\) indicates the presence of particular factors.

load("C. elegans data/Learned_signatures.RData")
# Contains mutation counts matrix mut_mat and exposure matrix small.X for the samples from C. elegans MMR and pole-4;pms-2 experiments.
learned.sigs <- nmSolve(t(mut_mat),small.X,maxIter=10000, tol = 1e-06, div.err = 1e-10)
for (i in 1:nrow(learned.sigs)) {
  learned.sigs[i,] <- learned.sigs[i,] / sum(learned.sigs[i,])
}
plot_sig(t(learned.sigs))

Humanize the signatures:

learned.sigs.exome <- learned.sigs
for (i in 1:nrow(learned.sigs.exome)) {
  learned.sigs.exome[i,] <- learned.sigs.exome[i,] / trinucleotide.freq.factor.ex[types]
}
plot_sig(t(learned.sigs.exome))

ICGC data

ICGC data for COAD and STAD dataset was downloaded from ICGC DCC using COAD-US and STAD-US projects with WXS analysis type, respectively. The vcf file describing all somatic mutations across ICGC dataset is stored here under the ‘simple_somatic_mutation.aggregated.vcf.gz’ name.

# download and read huge aggregated variant file
big.vcf <- readVcf("simple_somatic_mutation.aggregated.vcf")
mutations.COAD <- read.table(file="COAD/simple_somatic_mutation.open.tsv",sep="\t",header=T)
mutations.STAD <- read.table(file="STAD/simple_somatic_mutation.open.tsv",sep="\t",header=T)
per_sample_list <- sapply(unique(mutations.COAD$icgc_donor_id), function(donor) {
  unique(mutations.COAD$icgc_mutation_id[mutations.COAD$icgc_donor_id==donor])
})
names(per_sample_list) <- unique(mutations.COAD$icgc_donor_id)
vcf_list_COAD <- sapply(per_sample_list, function(x) big.vcf[as.character(x)])
names(vcf_list_COAD) <- names(per_sample_list)
vcf_list_COAD <- sapply(vcf_list_COAD, rowRanges)
for (sample in names(vcf_list_COAD)) {
  seqlevels(vcf_list_COAD[[sample]]) <- seqnames(get(ref_genome))[c(as.numeric(seqlevels(vcf_list_COAD[[sample]])[1:22]),25,23,24)]
  vcf_list_COAD[[sample]] <- vcf_list_COAD[[sample]][seqnames(vcf_list_COAD[[sample]])!="chrM"]
  vcf_list_COAD[[sample]] <- vcf_list_COAD[[sample]][seqnames(vcf_list_COAD[[sample]])!="chrY"]
}
per_sample_list <- sapply(unique(mutations.STAD$icgc_donor_id), function(donor) {
  unique(mutations.STAD$icgc_mutation_id[mutations.STAD$icgc_donor_id==donor])
})
names(per_sample_list) <- unique(mutations.STAD$icgc_donor_id)
vcf_list_STAD <- sapply(per_sample_list, function(x) big.vcf[as.character(x)])
names(vcf_list_STAD) <- names(per_sample_list)
vcf_list_STAD <- sapply(vcf_list_STAD, rowRanges)
for (sample in names(vcf_list_STAD)) {
  seqlevels(vcf_list_STAD[[sample]]) <- seqnames(get(ref_genome))[c(as.numeric(seqlevels(vcf_list_STAD[[sample]])[1:22]),25,23,24)]
  vcf_list_STAD[[sample]] <- vcf_list_STAD[[sample]][seqnames(vcf_list_STAD[[sample]])!="chrM"]
  vcf_list_STAD[[sample]] <- vcf_list_STAD[[sample]][seqnames(vcf_list_STAD[[sample]])!="chrY"]
}

all.types <- c(types.full,"INS_A","INS_C","INS_G","INS_T","DEL_A","DEL_C","DEL_G","DEL_T")
sub_list <- sapply(vcf_list_COAD, function(vcf) {
  vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==1,]
})
del_list <- sapply(vcf_list_COAD, function(vcf) {
  vcf[width(vcf$REF)==2 & width(unlist(vcf$ALT))==1,]
})
ins_list <- sapply(vcf_list_COAD, function(vcf) {
  vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==2,]
})
COAD.mutation.counts = matrix(0,nrow=length(vcf_list_COAD),ncol=length(all.types),dimnames=list(names(vcf_list_COAD),all.types))
for (i in 1:nrow(COAD.mutation.counts)) {
  type_context = type_context(sub_list[[i]], ref_genome)
  counts <- table(type_context)
  for (a in rownames(counts)) {
    tmp = unlist(strsplit(a,split="[>]"))
    inds <- colnames(counts)[counts[a,]>0]
    columns = as.vector(sapply(inds, function(x) paste(substr(x,1,1),"[",a,"]",substr(x,nchar(x),nchar(x)),sep="")))
    COAD.mutation.counts[i,columns] = counts[a,inds]
  }
  COAD.mutation.counts[i,97:100] <- table(substr(unlist(ins_list[[i]]$ALT),2,2))[c("A","C","G","T")]
  COAD.mutation.counts[i,101:104] <- table(substr(del_list[[i]]$REF,2,2))[c("A","C","G","T")]
  print(i)
}
COAD.mutation.counts[is.na(COAD.mutation.counts)] <- 0

sub_list <- sapply(vcf_list_STAD, function(vcf) {
  vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==1,]
})
del_list <- sapply(vcf_list_STAD, function(vcf) {
  vcf[width(vcf$REF)==2 & width(unlist(vcf$ALT))==1,]
})
ins_list <- sapply(vcf_list_STAD, function(vcf) {
  vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==2,]
})
STAD.mutation.counts = matrix(0,nrow=length(vcf_list_STAD),ncol=length(all.types),dimnames=list(names(vcf_list_STAD),all.types))
for (i in 1:nrow(STAD.mutation.counts)) {
  type_context = type_context(sub_list[[i]], ref_genome)
  counts <- table(type_context)
  for (a in rownames(counts)) {
    tmp = unlist(strsplit(a,split="[>]"))
    inds <- colnames(counts)[counts[a,]>0]
    columns = as.vector(sapply(inds, function(x) paste(substr(x,1,1),"[",a,"]",substr(x,nchar(x),nchar(x)),sep="")))
    STAD.mutation.counts[i,columns] = counts[a,inds]
  }
  STAD.mutation.counts[i,97:100] <- table(substr(unlist(ins_list[[i]]$ALT),2,2))[c("A","C","G","T")]
  STAD.mutation.counts[i,101:104] <- table(substr(del_list[[i]]$REF,2,2))[c("A","C","G","T")]
  print(i)
}
STAD.mutation.counts[is.na(STAD.mutation.counts)] <- 0

Or just upload the prepared data:

load("ICGC data/profiles_and_decomposition.RData")

Signature extraction

Signature extraction from the whole set is performed via Brunet version non-negative matrix factorization using NMF package. The number of signatures is chosen as the number of signatures where both residual sum of squares and Akakike Information Criterion values stabilize.

Plot AIC:

df = data.frame(rank=2:12,AIC)
ggplot(df,aes(x=rank,y=AIC)) + geom_point() + 
  ggtitle("AIC for selecting the number of signatures") + 
  geom_hline(yintercept = AIC[7],linetype = "longdash",colour="red")

Now plot RSS per rank:

df = data.frame(rank=2:12,rss)
ggplot(df,aes(x=rank,y=rss)) + geom_point() + 
  ggtitle("RSS for selecting the number of signatures") + 
  geom_hline(yintercept = rss[7],linetype = "longdash",colour="red")

This is how the final set of signatures looks like:

res <- NMF::nmf(x=mut_mat,rank=8,seed=123456,method='brunet')
sigs <- NMF::basis(res)
sigs <- sigs[,c(8,1,2,5,4,7,3,6)] # Reorder the signatures
decomposition <- t(NMF::coef(res))
decomposition <- decomposition[,c(8,1,2,5,4,7,3,6)]
colnames(sigs) <- c("Clock-1", "Clock-2", "POLE", "17-like", "MMR-1", "MMR-2", "MMR-3", "SNP")
colnames(decomposition) <- colnames(sigs)
for (i in 1:nrow(decomposition))
   decomposition[i,] = decomposition[i,]/sum(decomposition[i,])
plot_sig_104(sigs,size=12)

Signatures are assigned as follows: Clock-1 (5meC), Clock-2 (+APOBEC), POLE, 17-like, MMR-1 (20), MMR-2 (15), MMR-3 (21), MMR-4 (unknown).

Get microsatellite stability/instability (MSS/MSI status) from UCSC (or rather the Cancer Genome Atlas Clinical Explorer):

icgc.coad <- rownames(COAD.mutation.counts)
icgc.stad <- rownames(STAD.mutation.counts)
coad <- read.delim('ICGC data/COAD/donor.tsv',header=T)
stad <- read.delim('ICGC data/STAD/donor.tsv',header=T)
tcga.coad <- as.character(coad$submitted_donor_id[match(icgc.coad, coad$icgc_donor_id)])
tcga.stad <- as.character(stad$submitted_donor_id[match(icgc.stad, stad$icgc_donor_id)])

mss_coadread <- read.delim('ICGC data/COAD/COADREAD_2015-04-02_ClinicalParameters.txt',header=T)
mss_stad <- read.delim('ICGC data/STAD/STAD_2015-04-02_ClinicalParameters.txt',header=T)
mmr.coad <- icgc.coad[match(intersect(mss_coadread$SampleCode[mss_coadread$MSIstatus=='MSI-H'], tcga.coad),tcga.coad)]
mmr.stad <- icgc.stad[match(intersect(mss_stad$SampleCode[mss_stad$MSIstatus=='MSI-H'], tcga.stad),tcga.stad)]
mmr.ucsc <- c(mmr.stad, mmr.coad)

To assess the distribution of signatures in the data, we generate tSNE plot based on cosine similarity between sample profiles. Circle sizes reflect number of mutations, black rim - microsatellite instability (MSI high), coloured sectors correspond to relative contributions of different signatures. Note that all MSI samples group in one cluster across both datasets.

Visualize the similarity map:

MMR

Let’s plot signature decomposition for MSI samples in both datasets:

Association with MSI status: calculate P-values for relative signature contributions compared with MSI status.

pvals <- NULL
for (i in 1:8)
{
  pvals <- c(pvals, wilcox.test(decomposition[mmr.ucsc,i], decomposition[setdiff(rownames(mm),mmr.ucsc),i], paired=F,alternative='greater')$p.value)
}
names(pvals) = colnames(decomposition)
print(p.adjust(pvals,method = 'bonf')) # Take 0.01 as significance threshold (>10 tests)
##      Clock-1      Clock-2         POLE      17-like        MMR-1 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.720233e-51 
##        MMR-2        MMR-3          SNP 
## 2.695689e-22 9.806070e-29 1.000000e+00

MMR-1 seems to be a good indicator of MSI status. Calculate AUC for MMR-1:

library(pROC)
mmr.type <- sapply(rownames(mm), function(x) {
  if (x %in% mmr.ucsc) return("MSI")
  else return("MSS")
})
roc(response = factor(mmr.type), predictor = decomposition[,5])
## 
## Call:
## roc.default(response = factor(mmr.type), predictor = decomposition[,     5])
## 
## Data: decomposition[, 5] in 103 controls (factor(mmr.type) MSI) > 402 cases (factor(mmr.type) MSS).
## Area under the curve: 0.9847
ci(response = factor(mmr.type), predictor = decomposition[,5])
## 95% CI: 0.969-1 (DeLong)

Boxplots for signature MMR-1, MMR-2, MMR-3 contribution vs MSI status:

## Using sample, mmr.type as id variables

As we can see from the signature plot, all indels are coming with MMR-1; to confirm it, plot the absolute contribution of MMR signatures vs the number of indels:

To see the unusual contribution of Clock-1 signature, plot the average fold change (log scale) in the number of mutations assigned to different signatures in MSI samples and MSS samples:

Relative contributions for MMR deficient samples:

Absolute contributions for all the samples in both datasets:

Amount of indels

Plot the amount of indels per sample in both cohorts:

Lets find all the homopolymers of length 4 to 55 in human exome.

  1. Create homopolymer library
homopolymer_pool_length_4to55 <- lapply(seq(4,55),function(y) lapply(c("A","C","G","T"),function(x)
  if (x=="A") {paste0("B",paste(rep(x,y),collapse=""),"B")
  }
  else
    if (x=="T") {paste0("V",paste(rep(x,y),collapse=""),"V")
    }
  else
    if (x=="G") {paste0("H",paste(rep(x,y),collapse=""),"H")
    }
  else
    if (x=="C") {paste0("D",paste(rep(x,y),collapse=""),"D")
    }
))
homopolymers <- unlist(homopolymer_pool_length_4to55)
pattern_all <- DNAStringSet(homopolymers)
  1. Find all homopolymers in the genome:
hits_all <- list()
for (i in 1:23) {
  maskMotif(a[[i]], "N") -> masked
  hits_all[[i]] <- sapply(pattern_all, 
                          matchPattern, 
                          subject=masked,
                          fixed=F)
}
lengths <- lapply(hits_all, function(l) sapply(l,length))
hits_all <- lapply(1:length(hits_all), function(i) hits_all[[i]][which(lengths[[i]]>0)])
nonzero_lengths <- lapply(hits_all, function(chr) sapply(chr, length)) # numbers of homopolymers on each chromosome for non empty classes only
names(hits_all) <- seqlevels(gr)
  1. Create a dataframe withtheir coordinates and contexts:
sites.gr <- lapply(1:23, function(i) do.call("c",lapply(lapply(hits_all[[i]], as, "IRanges"),
                                                        GRanges,seqnames=names(hits_all)[i])))
for (j in 1:23) {
  sites.gr[[j]]$pattern.searched <- rep(as.character(pattern_all), lengths[[j]])
  sites.gr[[j]]$motif.found <- unlist(lapply(hits_all[[j]], as.character))
  sites.gr[[j]]$pattern.length <- unlist(lapply(hits_all[[j]], width)) # includes the two flanking bases
  sites.gr[[j]]$homopolymer.length <- (sites.gr[[j]]$pattern.length)-2 # remove 2 bases, 5' and 3' are not part of homopolymer
}
sites.gr_all <- do.call("c",sites.gr)
genome(sites.gr_all) <- "hg19" # add genome info
  1. Intersect with well-covereed part of the human exome:
exomranges <- GRanges(seqnames=exome$V1,ranges=IRanges(start = exome$V2,end=exome$V3))
hits <- findOverlaps(exomranges,sites.gr_all,minoverlap = 2)
sites.gr_all <- sites.gr_all[sort(unique(subjectHits(hits)))]
all_sites <- as.data.frame(sites.gr_all) # make a data frame
  1. Check the amount of 1bp indels in homopolymers in MSI samples in both cohorts:
# Upload the vcf files for both cohorts
load("ICGC data/STAD/vcf_list_STAD.RData")
indel_list_STAD <- sapply(vcf_list_STAD, function(vcf) vcf[abs(width(vcf$REF)-width(unlist(vcf$ALT)))==1 & (width(vcf$REF)==1 | width(unlist(vcf$ALT))==1),])
dims <- sapply(indel_list_STAD,length,simplify = T)
load("ICGC data/COAD/vcf_list_COAD.RData")
indel_list_COAD <- sapply(vcf_list_COAD, function(vcf) vcf[abs(width(vcf$REF)-width(unlist(vcf$ALT)))==1 & (width(vcf$REF)==1 | width(unlist(vcf$ALT))==1),])
dims <- sapply(indel_list_COAD,length,simplify = T)
for (k in 1:length(indel_list_STAD)) {
  genome(indel_list_STAD[[k]]) <- "hg19"
  indel_list_STAD[[k]]$insertion <- width(unlist(indel_list_STAD[[k]]$ALT))-1
  indel_list_STAD[[k]]$deletion <- width(indel_list_STAD[[k]]$REF)-1
}
for (k in 1:length(indel_list_COAD)) {
  genome(indel_list_COAD[[k]]) <- "hg19"
  indel_list_COAD[[k]]$insertion <- width(unlist(indel_list_COAD[[k]]$ALT))-1
  indel_list_COAD[[k]]$deletion <- width(indel_list_COAD[[k]]$REF)-1
}
indels.in.hp.stad <- vector("numeric",length(indel_list_STAD))
for (i in 1:length(indel_list_STAD)) {
  x <- subsetByOverlaps(indel_list_STAD[[i]],sites.gr_all)
  indels.in.hp.stad[i] <- nrow(as.data.frame(x))
}
names(indels.in.hp.stad) <- names(indel_list_STAD)
indels.in.hp.coad <- vector("numeric",length(indel_list_COAD))
for (i in 1:length(indel_list_COAD)) {
  x <- subsetByOverlaps(indel_list_COAD[[i]],sites.gr_all)
  indels.in.hp.coad[i] <- nrow(as.data.frame(x))
}
names(indels.in.hp.coad) <- names(indel_list_COAD)

Plot indels in homopolymers:

More about signature contributions

Same boxplots for realtive contributions in MSI and MSS samples for all signatures simultaneously:

Absolute contributions:

Similarities

We calculate cosine similarities between COSMIC signatures and de novo signature set; and also between C.elegans signatures and all if the abovementioned. Cosine similarity score measures the cosine of an angle between two vectors:

\(Sim(a,b) = \frac{<a,b>}{||a||*||b||}\)

cosine <- function(x,y) {
  x %*% y / sqrt(sum(x**2)) / sqrt(sum(y**2))
}

The higher the similarity, the closer the vectors are to each other.

Similarities between de novo signatures and C. elegans derived mlh-1, pms-2 and pole-4;pms-2 mutational patterns:

##         mlh.1 pms.2 pole.4.pms.2
## Clock-1  0.15  0.18         0.17
## Clock-2  0.41  0.36         0.32
## POLE     0.17  0.19         0.21
## 17-like  0.08  0.23         0.21
## MMR-1    0.41  0.81         0.85
## MMR-2    0.17  0.36         0.43
## MMR-3    0.29  0.62         0.56
## SNP      0.27  0.52         0.47

Similarities between de novo signatures and COSMIC cancer signatures:

##         Signature.1 Signature.2 Signature.3 Signature.4 Signature.5
## Clock-1        0.98        0.11        0.14        0.12        0.53
## Clock-2        0.25        0.62        0.75        0.55        0.66
## POLE           0.32        0.26        0.19        0.17        0.31
## 17-like        0.06        0.04        0.22        0.06        0.26
## MMR-1          0.59        0.08        0.43        0.43        0.71
## MMR-2          0.67        0.04        0.14        0.11        0.49
## MMR-3          0.39        0.09        0.34        0.19        0.62
## SNP            0.73        0.14        0.48        0.35        0.83
##         Signature.6 Signature.7 Signature.8 Signature.9 Signature.10
## Clock-1        0.86        0.23        0.42        0.39         0.36
## Clock-2        0.18        0.57        0.63        0.47         0.13
## POLE           0.24        0.41        0.32        0.39         0.97
## 17-like        0.08        0.05        0.16        0.61         0.06
## MMR-1          0.79        0.17        0.55        0.52         0.24
## MMR-2          0.86        0.09        0.34        0.34         0.21
## MMR-3          0.42        0.17        0.35        0.48         0.16
## SNP            0.69        0.32        0.52        0.60         0.25
##         Signature.11 Signature.12 Signature.13 Signature.14 Signature.15
## Clock-1         0.16         0.12         0.03         0.63         0.52
## Clock-2         0.54         0.39         0.50         0.26         0.12
## POLE            0.13         0.12         0.08         0.25         0.18
## 17-like         0.05         0.27         0.02         0.07         0.06
## MMR-1           0.35         0.46         0.04         0.85         0.70
## MMR-2           0.25         0.14         0.03         0.87         0.99
## MMR-3           0.26         0.79         0.03         0.40         0.35
## SNP             0.38         0.62         0.06         0.52         0.38
##         Signature.16 Signature.17 Signature.18 Signature.19 Signature.20
## Clock-1         0.12         0.20         0.25         0.48         0.53
## Clock-2         0.68         0.19         0.45         0.52         0.31
## POLE            0.24         0.18         0.37         0.19         0.24
## 17-like         0.36         0.98         0.04         0.08         0.12
## MMR-1           0.47         0.20         0.35         0.62         0.85
## MMR-2           0.14         0.12         0.20         0.52         0.30
## MMR-3           0.54         0.18         0.16         0.41         0.53
## SNP             0.61         0.29         0.31         0.65         0.79
##         Signature.21 Signature.22 Signature.23 Signature.24 Signature.25
## Clock-1         0.42         0.07         0.15         0.12         0.47
## Clock-2         0.19         0.22         0.39         0.44         0.55
## POLE            0.17         0.03         0.09         0.15         0.33
## 17-like         0.12         0.07         0.03         0.03         0.20
## MMR-1           0.43         0.08         0.43         0.32         0.50
## MMR-2           0.22         0.05         0.32         0.21         0.33
## MMR-3           0.95         0.08         0.28         0.14         0.45
## SNP             0.63         0.13         0.39         0.25         0.60
##         Signature.26 Signature.27 Signature.28 Signature.29 Signature.30
## Clock-1         0.46         0.16         0.08         0.45         0.38
## Clock-2         0.27         0.25         0.13         0.49         0.68
## POLE            0.23         0.09         0.21         0.27         0.21
## 17-like         0.20         0.04         0.73         0.05         0.11
## MMR-1           0.65         0.22         0.17         0.48         0.50
## MMR-2           0.49         0.18         0.06         0.33         0.38
## MMR-3           0.91         0.16         0.18         0.28         0.36
## SNP             0.71         0.21         0.23         0.47         0.55

Age correlation

Check if contribution of any signature correlates with age (answer - no).

Correlation with relative contributions:

##     Clock-1     Clock-2        POLE     17-like       MMR-1       MMR-2 
##  0.06675453 -0.02832896 -0.01666071 -0.04382598  0.12239589  0.11216179 
##       MMR-3         SNP 
##  0.03552525 -0.01160837

Correlation with absolute contributions:

##    Clock-1    Clock-2       POLE    17-like      MMR-1      MMR-2 
## 0.21618104 0.21717069 0.15605675 0.07358448 0.19466476 0.20038145 
##      MMR-3        SNP 
## 0.11834584 0.11104834

Average profiles

Plot averaged profiles of MSI samples in COAD and STAD datasets. Their cosine similarity shows that they are nearly identical.

##           [,1]
## [1,] 0.9946953